Text Classification¶

Basic Steps:¶

Data Cleaning
Import Necessary Libraries
Read the dataset
Visualize the dataset
Find the trend and relation in between features
Convert text into numerical features
Display sparse matrix
Split dataset into training and testing set
Perform learning opertion - fit
Predict accuracy -Score
Validate result - Confusion Matrix, Classification Report
Repeat the process - Till the desirable validation result

from IPython.core.display import Image 
Image(filename='C://Users//datta//Pictures//program_flow.jpg')

# Import All required packages
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt4agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from matplotlib.axes import Subplot
%matplotlib inline
import pandas as pd
import numpy as np
from numpy.random import randn
from scipy import stats 
import requests
import seaborn as sns
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
import warnings

warnings.filterwarnings("ignore")

from ipywidgets import interactive
from IPython.display import Audio, display
import ipywidgets as widgets
from IPython.display import display, clear_output, Javascript
from traitlets import Unicode

# nbconvert related imports
from nbconvert import get_export_names, export_by_name
from nbconvert.writers import FilesWriter
from nbformat import read, NO_CONVERT
from nbconvert.utils.exceptions import ConversionException

notebook_name = widgets.Text()

js = """IPython.notebook.kernel.widget_manager.get_model('%s').then(function(model) {
      model.set('value', IPython.notebook.notebook_name);
    model.save();
});
""" % notebook_name.model_id
display(Javascript(data=js))

filename = notebook_name.value
filename

'Text Classification_MNB_smaller_version.ipynb'

exporter_names = widgets.Dropdown(options=get_export_names(), value='html')
export_button = widgets.Button(description="Export")
download_link = widgets.HTML(visible=False)

taws_df = pd.read_excel('just_3.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

#smaller version
taws_df.count()

CAUSE_LEVEL_1                30248
Description                  30248
Resolution                   23133
Corrective_Action_lessons    27446
General_Catigory             30248
LOCATION                     30248
Merge                        30248
dtype: int64

transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

corpus = taws_df['Merge']

vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus).toarray()
print (X.shape)

(30248, 30382)

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

vectorizer.get_feature_names()

['00',
 '000',
 '0000',
 '0000h',
 '0000hrs',
 '0005',
 '0006',
 '0008',
 '000lb',
 '001',
 '0010',
 '00100lt',
 '0015',
 '0015lt',
 '002',
 '0020',
 '0025',
 '00250lt',
 '0029',
 '003',
 '0030',
 '0035',
 '004',
 '0042lt',
 '0045',
 '0045hrs',
 '005',
 '005s',
 '006',
 '007',
 '009',
 '009591959',
 '00am',
 '00h18',
 '00lt',
 '00m',
 '01',
 '010',
 '0100',
 '0100hrs',
 '0100lt',
 '0105',
 '0105h',
 '011',
 '0110hrs',
 '0115',
 '0118',
 '011vwd',
 '012',
 '0125lt',
 '0130',
 '0130hr',
 '013vwd',
 '014',
 '0145',
 '015',
 '0150',
 '016',
 '0169',
 '017',
 '019',
 '01and',
 '01chk',
 '01fig',
 '01no',
 '01st',
 '02',
 '0200',
 '0200hrs',
 '021',
 '0210',
 '0212',
 '0215lt',
 '022',
 '0220',
 '0221',
 '0224',
 '0228',
 '022vcp',
 '0230',
 '0230hrs',
 '0234',
 '0237',
 '023as',
 '024',
 '0240',
 '0242',
 '0248',
 '025',
 '0250',
 '0255',
 '027Ã¢',
 '02chk',
 '02fig',
 '02frm',
 '03',
 '030',
 '0300',
 '0300lt',
 '0308',
 '031',
 '0310',
 '0313',
 '0315lt',
 '0318lt',
 '032',
 '0327hrs',
 '033',
 '0330',
 '0330h',
 '0335',
 '0338lt',
 '034',
 '0340h',
 '0345h',
 '037',
 '037225',
 '039',
 '03a',
 '03lt',
 '03th',
 '03Ã¢',
 '04',
 '0400',
 '0400am',
 '0400h',
 '0400hrs',
 '0400lt',
 '0405',
 '0406',
 '0406lt',
 '040deg',
 '041',
 '0410h',
 '0410lt',
 '0412h',
 '0415',
 '0420hrs',
 '0420lt',
 '0425lt',
 '0429',
 '043',
 '0430',
 '0430hrs',
 '0435',
 '0440',
 '0445lt',
 '0446',
 '044e57',
 '045',
 '0450',
 '0459',
 '0460',
 '048',
 '0483',
 '049',
 '04th',
 '05',
 '0500',
 '0500lt',
 '0509',
 '0510',
 '0512',
 '0515lt',
 '0518',
 '0519',
 '052',
 '0520lt',
 '0530',
 '0535',
 '054',
 '0545',
 '055',
 '0550',
 '055092557',
 '057',
 '0579',
 '059',
 '05min',
 '05th',
 '06',
 '060',
 '0600',
 '0600hrs',
 '0601',
 '0605hrs',
 '0607',
 '060782',
 '060882',
 '061',
 '0610',
 '0615',
 '0618',
 '0625',
 '063',
 '0630',
 '0635',
 '064',
 '064306',
 '0647',
 '0648',
 '065',
 '0655',
 '068',
 '06h41',
 '06h50',
 '06Ã¢',
 '07',
 '0700',
 '0710',
 '0712',
 '0715',
 '0716',
 '072000',
 '072003',
 '0730',
 '073000',
 '0730hrs',
 '0732',
 '0733',
 '0734',
 '0735',
 '0740',
 '0745',
 '0745hrs',
 '079',
 '08',
 '0800',
 '0800h',
 '0800hrs',
 '0800lt',
 '080t',
 '0810',
 '081003',
 '0810lt',
 '0811',
 '0815',
 '0816',
 '0817',
 '082',
 '0824',
 '0830',
 '0830h',
 '0835',
 '0836',
 '0840',
 '0840h',
 '0840hrs',
 '0845',
 '085',
 '0850',
 '087',
 '088',
 '08h30',
 '08lt',
 '08nm',
 '08th',
 '09',
 '090',
 '0900',
 '0900h',
 '0900hrs',
 '0900l',
 '0900lt',
 '0903',
 '0905',
 '090deg',
 '0910',
 '091000',
 '091001',
 '091005',
 '0912',
 '0915',
 '0915hrs',
 '0918',
 '0924',
 '0925',
 '0929',
 '0930',
 '0930h',
 '0930lt',
 '0935',
 '0940',
 '0945',
 '0945hrs',
 '0948',
 '0949',
 '095',
 '0950',
 '0950lt',
 '0951',
 '0952',
 '0954',
 '0e',
 '0kg',
 '0m',
 '0mm',
 '0mt',
 '0mtrs',
 '0n',
 '0ne',
 '0nm',
 '0ppm',
 '0r',
 '0t',
 '0x',
 '10',
 '100',
 '1000',
 '10000',
 '1000h',
 '1000hrs',
 '1000lt',
 '1000rpm',
 '1002',
 '1003',
 '1005',
 '1005kw',
 '100982',
 '100bar',
 '100c',
 '100m',
 '100ppm',
 '101',
 '101082',
 '1013',
 '1015',
 '1015h',
 '1015hrs',
 '1016',
 '1016mb',
 '1018',
 '102',
 '1020',
 '1024',
 '1025',
 '103',
 '1030',
 '1030am',
 '1030h',
 '1030hrs',
 '1034',
 '1035',
 '1035hrs',
 '1036',
 '1045',
 '1045hrs',
 '104a',
 '105',
 '1050',
 '10500',
 '1053',
 '1055',
 '106',
 '1066',
 '107',
 '1078',
 '1089',
 '109',
 '10cm',
 '10deg',
 '10g',
 '10m',
 '10minutes',
 '10mm',
 '10mts',
 '10nm',
 '10ppm',
 '10t',
 '10utc',
 '10v',
 '10x8',
 '11',
 '110',
 '1100',
 '1100hrs',
 '1105h',
 '110m3',
 '110v',
 '110vac',
 '110volts',
 '111',
 '1115hrs',
 '1118lt',
 '1119',
 '112',
 '1120',
 '1121',
 '1127',
 '1127log',
 '113',
 '1130hrs',
 '1133',
 '1135',
 '1139',
 '114',
 '1140',
 '1144',
 '1147',
 '115',
 '1150',
 '1151',
 '1152',
 '115t',
 '116',
 '1162',
 '1182',
 '119',
 '11oo',
 '11pcs',
 '12',
 '120',
 '1200',
 '1200h',
 '1200hrs',
 '1200mmwg',
 '1205',
 '1206',
 '1207',
 '120ltrs',
 '120of',
 '120v',
 '120vac',
 '121',
 '1210',
 '1210lt',
 '1211',
 '1214',
 '1215',
 '1215hrs',
 '1225',
 '123',
 '1230',
 '1245',
 '1245hrs',
 '125',
 '1254',
 '1255',
 '127',
 '1273',
 '129',
 '12hrs',
 '12kg',
 '12lt',
 '12mins',
 '12mm',
 '12months',
 '12mt',
 '12mts',
 '12n24',
 '12nn',
 '12p',
 '12t',
 '12th',
 '12x4',
 '13',
 '130',
 '1300',
 '13000rpm',
 '1300hrs',
 '1300lt',
 '1305',
 '130c',
 '131',
 '1312hrs',
 '1313',
 '1314',
 '1315',
 '1315hrs',
 '1320',
 '1320h',
 '1325',
 '1325hrs',
 '133',
 '1330',
 '1330hrs',
 '1330lt',
 '1336hrs',
 '1340',
 '1344',
 '1345',
 '1345h',
 '1345hrs',
 '1345lt',
 '1346',
 '1348',
 '1349',
 '135',
 '1350',
 '1353',
 '1354',
 '1354hrs',
 '1355',
 '1357',
 '135l',
 '136',
 '13600rpm',
 '1366',
 '137',
 '1379',
 '138',
 '1383',
 '1385',
 '1386',
 '1389',
 '139',
 '1398',
 '13cm',
 '13th',
 '14',
 '140',
 '1400',
 '14001',
 '1400hrs',
 '1406',
 '140bar',
 '140c',
 '140t',
 '141',
 '1410h',
 '1418',
 '142',
 '1420',
 '1420h',
 '1422hrs',
 '1426hrs',
 '1428',
 '143',
 '1430',
 '1430l',
 '1430lt',
 '1435',
 '1435h',
 '1435hrs',
 '1442',
 '1442hrs',
 '1444',
 '1445',
 '1445lt',
 '1447hrs',
 '1448',
 '1448hrs',
 '145',
 '1450',
 '1455lt',
 '1459',
 '146',
 '147',
 '148',
 '14m',
 '14mmx8mm',
 '14th',
 '15',
 '150',
 '1500',
 '1500bar',
 '1500hrs',
 '1500lt',
 '150bars',
 '150cm',
 '150ltr',
 '150m',
 '150mm',
 '150v',
 '151',
 '1510h',
 '1510lt',
 '1515',
 '151g',
 '1524',
 '1525lt',
 '152g139',
 '1530',
 '1530h',
 '1530hrs',
 '1532',
 '1533',
 '1536',
 '1540',
 '1545',
 '1545lt',
 '154m',
 '155',
 '1550',
 '1552',
 '1553hrs',
 '15cm',
 '15hrs',
 '15kg',
 '15lt',
 '15m',
 '15nm',
 '15ppm',
 '15sec',
 '15th',
 '15yrs',
 '16',
 '160',
 '1600',
 '16000',
 '1600h',
 '1605',
 '1607',
 '1610',
 '1620',
 '1625',
 '1630',
 '1630h',
 '1630hrs',
 '1630lt',
 '164',
 '1640',
 '1642',
 '1645lt',
 '1650',
 '1653',
 '1655',
 '165Ã¢',
 '166',
 '167',
 '168',
 '169',
 '16a',
 '16kpa',
 '16m',
 '16mm',
 '16th',
 '16Ã¢Âº',
 '17',
 '170',
 '1700',
 '1700h',
 '1700hrs',
 '1706',
 '1706lt',
 '1710',
 '1712',
 '1715',
 '173',
 '1730',
 '1730lt',
 '1736',
 '173cm',
 '174',
 '175',
 '1755',
 '1757',
 '175Ã¢',
 '176',
 '177',
 '1773',
 '17a',
 '17oohrs',
 '17sb',
 '17t',
 '17th',
 '18',
 '180',
 '1800',
 '1800hrs',
 '1800lt',
 '1805',
 '1810',
 '1812',
 '1815',
 '1815hrs',
 '1820',
 '1825',
 '1826',
 '183',
 '1830',
 '1830hrs',
 '1833',
 '1836hrs',
 '1838hrs',
 '1840',
 '1842',
 '1845',
 '1848',
 '185',
 '1850',
 '1850hrs',
 '1875',
 '188',
 '18h42',
 '18m',
 '18th',
 '19',
 '190',
 '1900',
 '1900hrs',
 '190111',
 '190112',
 '1905',
 '1909',
 '1910h',
 '1915',
 '192',
 '1920h',
 '1922',
 '1923',
 '1928',
 '1930',
 '1930hrs',
 '1932',
 '1942',
 '1945',
 '1950',
 '1950h',
 '1952',
 '1974',
 '199',
 '1993',
 '19th',
 '1a',
 '1ae',
 '1assit',
 '1c',
 '1cable',
 '1ch',
 '1cooling',
 '1cub',
 '1day',
 '1db',
 '1dbwbt',
 '1ft',
 '1gen',
 '1h',
 '1hr',
 '1kg',
 '1ltrs',
 '1m',
 '1m3',
 '1meter',
 '1metre',
 '1mm',
 '1month',
 '1mtr',
 '1nm',
 '1no',
 '1o',
 '1oe',
 '1on',
 '1p',
 '1pc',
 '1s',
 '1spring',
 '1st',
 '1t',
 '1ton',
 '1was',
 '1x',
 '1x2',
 '1Ã¢',
 '1Ã¢Â½',
 '20',
 '200',
 '2000',
 '2000h',
 '2000hrs',
 '2000lt',
 '2000ppm',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '200l',
 '200litre',
 '200ltrs',
 '200m',
 '200mm',
 '200mtrs',
 '200ppm',
 '201',
 '2010',
 '2010hrs',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015hrs',
 '2015lt',
 '2019',
 '201a',
 '202',
 '2020',
 '203',
 '2030',
 '2030hrs',
 '2040',
 '2042',
 '205',
 '2055',
 '2059',
 '209',
 '20cm',
 '20deg',
 '20degs',
 '20ft',
 '20kg',
 '20knot',
 '20kv',
 '20l',
 '20litres',
 '20lt',
 '20ltr',
 '20ltrs',
 '20m',
 '20man',
 '20min',
 '20mins',
 '20mm',
 '20mtrs',
 '20nm',
 '20seconds',
 '20th',
 '20tonnes',
 '20Ã¢',
 '20Ã¢Âº',
 '21',
 '210',
 '2100',
 '2100hrs',
 '2100i',
 '2103',
 '2107',
 '210deg',
 '2115',
 '2115lt',
 '2118',
 '212',
 '2120',
 '2122',
 '2124',
 '213',
 '2130',
 '2135h',
 '2138',
 '2139',
 '214',
 '2140lt',
 '2142',
 '2145hrs',
 '2148',
 '2154',
 '2159',
 '215deg',
 '216',
 '218',
 '21st',
 '21x29x50',
 '21x29x50cm',
 '22',
 '220',
 '2200',
 '22000',
 '2200hrs',
 '2200hts',
 '2200lt',
 '2201',
 '220v',
 '220vac',
 '220volts',
 '2210',
 '2210lt',
 '2211',
 '2215',
 '2220hrs',
 '223',
 '2230',
 '2230lt',
 '224Ã¢',
 '2251',
 '2255',
 '2257',
 '226',
 '228',
 '229f',
 '229Ã¢',
 '22aug',
 '22nd',
 '23',
 '230',
 '2300',
 '2300hrs',
 '2300lt',
 '2305',
 '2308490',
 '230deg',
 '230f',
 '230v',
 '232',
 '2320',
 '2329',
 '2330',
 '2330lt',
 '2335',
 '2335hrs',
 '233b',
 '234',
 '235',
 '2350',
 '23529kw',
 '2354',
 '2371',
 '23a',
 '23b',
 '23d',
 '23e',
 '23i',
 '23lt',
 '23m',
 '23mpa',
 '23rd',
 '24',
 '240',
 '2400',
 '2416',
 '241f',
 '2421',
 '245',
 '2450',
 '2470',
 '248',
 '24hrs',
 '24lt',
 '24th',
 '24v',
 '24volt',
 '24volts',
 '25',
 '250',
 '2500',
 '250mt',
 '250v',
 '254v',
 '255Ã¢',
 '257',
 '258',
 '25cub',
 '25dec',
 '25hp',
 '25kg',
 '25knots',
 '25kts',
 '25ltr',
 '25m',
 '25metres',
 '25nm',
 '25rpm',
 '25t',
 '25th',
 '26',
 '261',
 '261Ã¢',
 '267',
 '268',
 '26th',
 '26Ã¢',
 '27',
 '270',
 '2700',
 '270Ã¢',
 '2735',
 '275',
 '2782',
 '2790',
 '27f',
 '27g',
 '27m',
 '27rpm',
 '27th',
 '28',
 '280',
 '2800',
 '280nm',
 '281',
 '2810',
 '2845',
 '285',
 '288',
 '28th',
 '29',
 '290',
 '292',
 '29aug09',
 '29cfr',
 '29mar',
 '29th',
 '2a',
 '2ae',
 '2aux',
 '2cables',
 '2ct',
 '2de',
 '2deck',
 '2e',
 ...]

causes = taws_df["CAUSE_LEVEL_1"].unique()
cause_dict = {value:index for index, value in enumerate(causes)}
y = taws_df["CAUSE_LEVEL_1"].map(cause_dict)
cause_dict

{'     nb      vcnb  ': 5,
 'ENVIRONMENT / WEATHER': 1,
 'EQUIPMENT': 2,
 'ORGANIZATION': 4,
 'PEOPLE': 0,
 'UNSPECIFIED': 3}

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.2)

clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.78347107438016528

clf_dt = tree.DecisionTreeClassifier(max_depth=10)
clf_dt.fit (X_train, y_train)
clf_dt.score (X_test, y_test)

0.77669421487603307

y_test

16543    0
20613    2
18600    2
6794     0
11416    0
14463    0
2905     0
13024    0
3078     0
8951     0
8552     0
6317     0
13168    0
6894     0
13856    0
8812     0
9250     0
3932     2
16353    0
20314    0
11105    0
6859     2
24919    2
2611     2
10683    0
12022    2
20952    2
17558    2
11991    0
26854    2
        ..
27557    1
25124    0
10219    0
13860    0
19152    4
8399     0
4758     0
27307    0
28275    0
28813    0
9055     0
4299     0
13396    0
5292     1
17231    0
22747    0
7715     0
9271     0
20913    0
26602    2
29489    0
12000    0
11975    0
16706    2
6351     2
14968    0
7554     0
5515     4
3848     0
9198     0
Name: CAUSE_LEVEL_1, Length: 6050, dtype: int64

y_pred = clf.predict(X_test)

shuffle_validator = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2)
def test_classifier(clf):
    scores = cross_validation.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

cm = confusion_matrix(y_test, y_pred)
cm

array([[3985,    5,  552,    0,   98],
       [  77,    9,   81,    0,    5],
       [ 310,    1,  677,    0,   25],
       [   6,    1,    8,    1,    1],
       [  85,    1,   54,    0,   68]])

plt.matshow(cm)
plt.title('Confusion matrix')
c = plt.summer()
plt.colorbar(c)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

target_names = ['PEOPLE', 'ENVIRONMENT', 'EQUIPMENT', 'ORGANISATION', 'UNSPECIFIED']

print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      PEOPLE       0.89      0.86      0.88      4640
 ENVIRONMENT       0.53      0.05      0.10       172
   EQUIPMENT       0.49      0.67      0.57      1013
ORGANISATION       1.00      0.06      0.11        17
 UNSPECIFIED       0.35      0.33      0.34       208

 avg / total       0.80      0.78      0.78      6050

test_classifier(clf)

Accuracy: 0.7717 (+/- 0.00)

sample_dataframe = taws_df.sample(n=200)

pwas_df = pd.read_excel('200ex.xlsx', 'Sheet1', index_col=None)

pwas_df.head()

n = 0

for n in range(200):
    pm = pwas_df.ix[n]
    vect_pm = vectorizer.transform(pm).toarray()
    m = clf.predict(vect_pm)
    print(m)

[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[2]
[0]
[0]
[2]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[4]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[2]
[0]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[2]
[0]
[2]
[4]
[0]
[0]
[2]
[2]
[2]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[0]
[0]
[0]
[2]
[0]
[2]
[2]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]

cons_df = pd.read_excel('machine_vs_human.xlsx', 'Sheet1', index_col=None)

m_test = cons_df['humnan']

m_pred = cons_df['machine']

cm2 = confusion_matrix(m_test, m_pred)

cm2

array([[152,   0,  10,   0,   0],
       [  4,   3,   0,   0,   0],
       [  4,   0,  18,   0,   0],
       [  0,   0,   1,   0,   0],
       [  4,   0,   2,   0,   2]])

plt.matshow(cm2)
plt.title('Confusion matrix')
c = plt.summer()
plt.colorbar(c)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(classification_report(m_test, m_pred, target_names=target_names))

              precision    recall  f1-score   support

      PEOPLE       0.93      0.94      0.93       162
 ENVIRONMENT       1.00      0.43      0.60         7
   EQUIPMENT       0.58      0.82      0.68        22
ORGANISATION       0.00      0.00      0.00         1
 UNSPECIFIED       1.00      0.25      0.40         8

 avg / total       0.89      0.88      0.87       200

This is very useful tool to share and convert the file into different file format. source

file_writer = FilesWriter()

def export(name, nb):
    
    # Get a unique key for the notebook and set it in the resources object.
    notebook_name = name[:name.rfind('.')]
    resources = {}
    resources['unique_key'] = notebook_name
    resources['output_files_dir'] = '%s_files' % notebook_name

    # Try to export
    try:
        output, resources = export_by_name(exporter_names.value, nb)
    except ConversionException as e:
        download_link.value = "<br>Could not export notebook!"
    else:
        write_results = file_writer.write(output, resources, notebook_name=notebook_name)
    
        download_link.value = "<br>Results: <a href='files/{filename}'><i>\"{filename}\"</i></a>".format(filename=write_results)
        download_link.visible = True
        
def handle_export(widget):
    with open(filename, 'r') as f:
        export(filename, read(f, NO_CONVERT))
        
export_button.on_click(handle_export)

display(exporter_names, export_button, download_link)

	Description
0	J/Eng was checking the chemical racks, where h...
1	One stay wire connected to Christmas Tree loos...
2	I found out that Pilot combination ladder (P) ...
3	AT (Time). One crew member kept food in microw...
4	During loading operations in (Location) by shi...